# Computations
import numpy as np
import pandas as pd
import scipy.stats as stats
# sklearn
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, KFold, StratifiedShuffleSplit
from sklearn.feature_selection import RFE
from sklearn import datasets
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this study, we analyze HR data available from kaggle.com. This data is fictional and was created by IBM data scientists.
# Load the pre-standardised HR attrition dataset and split features from target.
Path = 'Data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx'
# The standardised CSV shares the Excel file's stem, suffixed with '_STD'.
Data = pd.read_csv(Path.split(".")[0]+'_STD.csv')
Target = 'Attrition'
# Mapping from the encoded target (0/1) back to its human-readable labels.
Labels_dict = {0: 'No', 1: 'Yes'}
display(Data)
# Columns excluded from the feature matrix: the target and the row identifier.
Aditional_Columns = [Target, 'Employee Number']
X = Data.drop(columns = Aditional_Columns)
y = Data[Target]
| Age | Attrition | Business Travel | Daily Rate | Department | Distance From Home | Education | Education Field | Employee Number | Environment Satisfaction | ... | Performance Rating | Relationship Satisfaction | Stock Option Level | Total Working Years | Training Times Last Year | Work Life Balance | Years At Company | Years In Current Role | Years Since Last Promotion | Years With Current Manager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.616734 | 1 | 0.590048 | -0.641602 | 1.401512 | -0.990124 | -0.891688 | -0.937414 | 1 | -0.660531 | ... | -0.426230 | -1.584178 | -0.932014 | -0.672516 | -2.171982 | -2.493820 | -0.402918 | -0.745297 | -0.397004 | 1.081402 |
| 1 | 0.616734 | 0 | -0.913194 | -1.611926 | -0.493817 | 1.395913 | -1.868426 | -0.937414 | 2 | 0.254625 | ... | 2.346151 | 1.191438 | 0.241988 | -0.672516 | 0.155707 | 0.338096 | -0.402918 | 1.083729 | -0.397004 | 1.081402 |
| 2 | 0.616734 | 1 | 0.590048 | -0.641602 | -0.493817 | -0.990124 | -0.891688 | 1.316673 | 4 | 1.169781 | ... | -0.426230 | -0.658973 | -0.932014 | -0.672516 | 0.155707 | 0.338096 | -0.402918 | -0.745297 | -0.397004 | -0.737432 |
| 3 | -0.661967 | 0 | -0.913194 | -0.641602 | -0.493817 | -0.990124 | 1.061787 | -0.937414 | 5 | 1.169781 | ... | -0.426230 | 0.266233 | -0.932014 | -0.672516 | 0.155707 | 0.338096 | -0.402918 | 1.083729 | -0.397004 | -0.737432 |
| 4 | -0.661967 | 0 | 0.590048 | 0.328722 | -0.493817 | -0.990124 | -1.868426 | 0.565311 | 7 | -1.575686 | ... | -0.426230 | 1.191438 | 0.241988 | -0.672516 | 0.155707 | 0.338096 | -0.402918 | -0.745297 | -0.397004 | -0.737432 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 0.616734 | 0 | -0.913194 | 1.299046 | -0.493817 | 0.600567 | -0.891688 | 0.565311 | 2061 | 0.254625 | ... | -0.426230 | 0.266233 | 0.241988 | 0.563232 | 0.155707 | 0.338096 | -0.402918 | -0.745297 | -0.397004 | -0.737432 |
| 1466 | 0.616734 | 0 | 0.590048 | 1.299046 | -0.493817 | 1.395913 | -1.868426 | 0.565311 | 2062 | 1.169781 | ... | -0.426230 | -1.584178 | 0.241988 | -0.672516 | 1.707500 | 0.338096 | -0.402918 | 1.083729 | -0.397004 | 1.081402 |
| 1467 | -0.661967 | 0 | 0.590048 | -1.611926 | -0.493817 | -0.990124 | 0.085049 | -0.937414 | 2064 | -0.660531 | ... | 2.346151 | -0.658973 | 0.241988 | -0.672516 | -2.171982 | 0.338096 | -0.402918 | -0.745297 | -0.397004 | -0.737432 |
| 1468 | 0.616734 | 0 | -0.913194 | -0.641602 | 1.401512 | -0.990124 | 0.085049 | 0.565311 | 2065 | 1.169781 | ... | -0.426230 | 1.191438 | -0.932014 | 0.563232 | 0.155707 | -1.077862 | -0.402918 | 1.083729 | -0.397004 | 1.081402 |
| 1469 | -0.661967 | 0 | 0.590048 | 1.299046 | -0.493817 | 1.395913 | 0.085049 | 0.565311 | 2068 | -0.660531 | ... | -0.426230 | -1.584178 | -0.932014 | -0.672516 | 0.155707 | 1.754054 | -0.402918 | -0.745297 | -0.397004 | -0.737432 |
1470 rows × 32 columns
def DatasetTargetDist(Inp, Target, Labels_dict, PD):
    """Show the distribution of *Target* in *Inp*: a count/percentage table on
    the left and a pie chart on the right, styled via the PD parameter dict."""
    # Count/percentage summary, with encoded labels mapped to readable names.
    counts = Inp[Target].value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
    counts[Target] = counts[Target].replace(Labels_dict)
    counts['Percentage'] = np.round(100*(counts['Count']/counts['Count'].sum()),2)
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=PD['column_widths'],
                        specs=[[{"type": "table"},{"type": "pie"}]])
    # Right panel: pie chart of class counts.
    fig.add_trace(go.Pie(labels=counts[Target].values, values=counts['Count'].values,
                         pull=PD['pull'], textfont=dict(size= PD['textfont']),
                         marker=dict(colors = PD['PieColors'], line=dict(color='black', width=1))), row=1, col=2)
    fig.update_traces(hole=PD['hole'])
    fig.update_layout(height = PD['height'], legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
    # Left panel: the same summary rendered as a table (percentages formatted).
    formatted = counts.copy()
    formatted['Percentage'] = formatted['Percentage'].map(lambda x: '%%%.2f' % x)
    cell_values = [formatted.loc[:, col].values for col in formatted.columns]
    fig.add_trace(go.Table(header=dict(values = list(counts.columns), line_color='darkslategray',
                                       fill_color= PD['TableColors'][0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
                           cells=dict(values=cell_values, line_color='darkslategray',
                                      fill=dict(color= [PD['TableColors'][1], PD['TableColors'][1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    fig.update_layout(title={'text': '<b>' + Target + '<b>', 'x':PD['title_x'],
                             'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Pie-slice pull offsets: keep every slice flat except the last one (detached).
Pull = [0] * (len(Labels_dict) - 1) + [.05]
# Styling parameters consumed by DatasetTargetDist.
PD = dict(PieColors = ['SeaGreen','FireBrick'],
          TableColors = ['DarkGreen','HoneyDew'], hole = .4,
          column_widths=[0.6, 0.4],textfont = 14, height = 350, tablecolumnwidth = [0.1, 0.1, 0.1],
          pull = Pull, legend_title = Target, title_x = 0.5, title_y = 0.8)
del Pull
DatasetTargetDist(Data, Target, Labels_dict, PD)
StratifiedShuffleSplit returns stratified randomized splits: each set contains approximately the same percentage of samples of each target class as the complete set.
# Single stratified train/test split preserving the class ratio of `y`.
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
    # split() yields POSITIONAL indices, so use .iloc; .loc only worked here
    # by coincidence because the DataFrame has a default RangeIndex.
    # X
    if isinstance(X, pd.DataFrame):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    else:
        X_train, X_test = X[train_index], X[test_index]
    # y
    if isinstance(y, pd.Series):
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    else:
        y_train, y_test = y[train_index], y[test_index]
del sss
def Train_Test_Dist(X_train, y_train, X_test, y_test, PD, Labels_dict = Labels_dict):
    """Display the shapes of the split sets (left table) and the class balance
    of y_train / y_test (two pie charts) in one plotly figure.

    PD supplies the styling knobs (colors, widths, pull, title position, ...).
    """
    def ToSeries(x):
        # Normalise plain arrays to a Series so .replace/.value_counts work.
        if not isinstance(x, pd.Series):
            Out = pd.Series(x)
        else:
            Out = x.copy()
        return Out
    fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.02, column_widths= PD['column_widths'],
                        specs=[[{"type": "table"},{'type':'domain'}, {'type':'domain'}]])
    # Right: pie charts for train (col 2) and test (col 3) label distributions.
    C = 2
    for y_set in [ToSeries(y_train).replace(Labels_dict), ToSeries(y_test).replace(Labels_dict)]:
        # Reindex the counts to the label order used for `labels=`; value_counts()
        # alone orders by frequency, which could misalign labels and values.
        fig.add_trace(go.Pie(labels= list(Labels_dict.values()),
                             values= y_set.value_counts().reindex(list(Labels_dict.values())).values,
                             pull=PD['pull'],
                             textfont=dict(size=PD['textfont']),
                             marker=dict(colors = PD['PieColors'],
                                         line=dict(color='black', width=1))), row=1, col=C)
        fig.update_traces(hole=.5)
        fig.update_layout(legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
        C+=1
    # Left: table listing the shape of each split set.
    Table = pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
                               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).astype(str)
    T = Table.copy()
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    TableColors = PD['TableColors']
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= TableColors[0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    fig.update_layout(title={'text': '<b>' + 'Dataset Distribution' + '<b>', 'x':PD['title_x'],
                             'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    if PD['height'] is not None:
        fig.update_layout(height = PD['height'])
    fig.show()
# Adjust the shared styling dict for the three-panel layout, then plot.
PD.update(column_widths=[0.3, 0.3, 0.3],
          tablecolumnwidth=[0.2, 0.4], height=350, legend_title=Target)
Train_Test_Dist(X_train, y_train, X_test, y_test, PD)
In this article, we use scikit-learn's GaussianNB estimator, which implements the Gaussian Naive Bayes algorithm for classification. The likelihood of the features is assumed to be \begin{align} P(x_i \mid y) = \frac{1}{\sqrt{2\pi\sigma^2_y}} \exp\left(-\frac{(x_i - \mu_y)^2}{2\sigma^2_y}\right) \end{align} The parameters $\sigma_y$ and $\mu_y$ are estimated using maximum likelihood.
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print *Text* on a colored banner, padded with '=' to a total width of L.

    C is the background color name, T the text (foreground) color name.
    """
    # 'White' added so BACK supports every color FORE does; previously
    # Header(..., C='White') raised KeyError.
    BACK = {'Black': Back.BLACK, 'Red':Back.RED, 'Green':Back.GREEN, 'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
            'Magenta':Back.MAGENTA, 'Cyan': Back.CYAN, 'White': Back.WHITE}
    FORE = {'Black': Fore.BLACK, 'Red':Fore.RED, 'Green':Fore.GREEN, 'Yellow':Fore.YELLOW, 'Blue':Fore.BLUE,
            'Magenta':Fore.MAGENTA, 'Cyan':Fore.CYAN, 'White': Fore.WHITE}
    # Banner text, then a '=' rule (in the banner color) filling the remaining width.
    print(BACK[C] + FORE[T] + Style.NORMAL + Text + Style.RESET_ALL + ' ' + FORE[C] +
          Style.NORMAL + (L- len(Text) - 1)*'=' + Style.RESET_ALL)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of '=' characters, L wide, in color C."""
    palette = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN, 'Yellow': Fore.YELLOW,
               'Blue': Fore.BLUE, 'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    print(palette[C] + Style.NORMAL + '=' * L + Style.RESET_ALL)
def Search_List(Key, List): return [s for s in List if Key in s]
def Best_Parm(model, param_dist, Top = None, X = X, y = y, n_splits = 20, scoring = 'precision', H = 600, titleY = .95):
    """Randomized-search *model* over *param_dist*, display the top results as a
    styled table, plot train/test scores, and return the fitted search object.

    Top: number of best rows to show (None = all). X/y default to the
    module-level dataset; n_splits/scoring configure the CV; H/titleY style
    the performance plot.
    """
    grid = RandomizedSearchCV(estimator = model, param_distributions = param_dist,
                              cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=Test_Size, random_state=42),
                              n_iter = int(1e3), scoring = scoring, error_score = 0, verbose = 0,
                              n_jobs = 10, return_train_score = True)
    _ = grid.fit(X, y)
    Table = Grid_Table(grid)
    if Top is None:  # PEP 8: identity test for None, not '== None'
        Top = Table.shape[0]
    Table = Table.iloc[:Top,:]
    # Human-readable 'mean ± std' columns for display.
    T = Table.copy()
    T['Train Score'] = T['Mean Train Score'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Train Score'].map(lambda x: ('%.2e' % x))
    T['Test Score'] = T['Mean Test Score'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Test Score'].map(lambda x: ('%.2e' % x))
    T['Fit Time'] = T['Mean Fit Time'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Fit Time'].map(lambda x: ('%.2e' % x))
    T = T.drop(columns = ['Mean Train Score','STD Train Score','Mean Test Score','STD Test Score','Mean Fit Time','STD Fit Time'])
    # Table is already truncated to Top rows, so the former .head(Top) was redundant.
    display(T.style.hide_index().background_gradient(subset= ['Rank Test Score'],
                                                     cmap=sns.diverging_palette(145, 300, s=60, as_cmap=True)).\
            set_properties(subset=['Params'], **{'background-color': 'Indigo', 'color': 'White'}).\
            set_properties(subset=['Train Score'], **{'background-color': 'HoneyDew', 'color': 'Black'}).\
            set_properties(subset=['Test Score'], **{'background-color': 'Azure', 'color': 'Black'}).\
            set_properties(subset=['Fit Time'], **{'background-color': 'Linen', 'color': 'Black'}))
    # Plot train/test score curves for the displayed rows.
    Grid_Performance_Plot(Table, n_splits = n_splits, H = H, titleY = titleY)
    return grid
def Grid_Table(grid):
    """Summarise a fitted search object's cv_results_ as a DataFrame sorted by
    test-score rank (best first)."""
    results = grid.cv_results_
    # Render each parameter dict as a plain 'key: value' string.
    params = [str(p).replace('{', '').replace('}', '').replace("'", '')
              for p in results['params']]
    summary = pd.DataFrame({'Rank Test Score': results['rank_test_score'],
                            'Params': params,
                            # Train
                            'Mean Train Score': results['mean_train_score'],
                            'STD Train Score': results['std_train_score'],
                            # Test
                            'Mean Test Score': results['mean_test_score'],
                            'STD Test Score': results['std_test_score'],
                            # Fit time
                            'Mean Fit Time': results['mean_fit_time'],
                            'STD Fit Time': results['std_fit_time']})
    return summary.sort_values('Rank Test Score').reset_index(drop = True)
def Grid_Performance_Plot(Table, n_splits, H = 550, titleY =.95):
    # Plot mean ± std train/test scores (error bars) for each parameter
    # combination listed in Table, as two side-by-side panels.
    # Lower y-axis bound, derived from the smallest (mean - std) value.
    Temp = Table['Mean Train Score']-Table['STD Train Score']
    Temp = np.append(Temp, Table['Mean Test Score']-Table['STD Test Score'])
    # NOTE(review): (Temp*100 - Temp) equals 99*Temp, so L is floor(99*Temp)/100.
    # Possibly np.floor(Temp*100)/100 (round down to 2 decimals) was intended —
    # confirm before changing; same applies to R below.
    L = np.floor((Temp*100- Temp)).min()/100
    # Upper y-axis bound, derived from the largest (mean + std) value.
    Temp = Table['Mean Train Score']+Table['STD Train Score']
    Temp = np.append(Temp, Table['Mean Test Score']+Table['STD Test Score'])
    R = np.ceil((Temp*100 + Temp)).max()/100
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, shared_yaxes=True,
                        subplot_titles=('<b>' + 'Train Set' + '<b>', '<b>' + 'Test Set' + '<b>'))
    # Train scores on the left, test scores on the right; std as error bars.
    fig.add_trace(go.Scatter(x= Table['Params'], y= Table['Mean Train Score'], showlegend=False, marker_color= 'SeaGreen',
                             error_y=dict(type='data',array=Table['STD Train Score'], visible=True)), 1, 1)
    fig.add_trace(go.Scatter(x= Table['Params'], y= Table['Mean Test Score'], showlegend=False, marker_color= 'RoyalBlue',
                             error_y=dict(type='data',array= Table['STD Test Score'], visible=True)), 1, 2)
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    # Shared y axis clipped to the [L, R] range computed above.
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray', range= [L, R])
    fig.update_yaxes(title_text="Mean Score", row=1, col=1)
    fig.update_layout(plot_bgcolor= 'white', width = 980, height = H,
                      title={'text': '<b>' + 'RandomizedSearchCV with %i-fold cross validation' % n_splits + '<b>',
                             'x':0.5, 'y':titleY, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def Stratified_CV_Scoring(model, X = X, y = y, n_splits = 10, Labels = list(Labels_dict.values())):
    """Refit *model* on each of `n_splits` stratified shuffle splits and
    aggregate the per-split classification reports and confusion matrices.

    Returns (Reports_Train, Reports_Test, CM_Train, CM_Test): the reports are
    DataFrames of 'mean ± std' strings over splits; the confusion matrices are
    the element-wise mean over splits, rounded to int.
    """
    sss = StratifiedShuffleSplit(n_splits = n_splits, test_size=Test_Size, random_state=42)
    # Work with plain arrays so positional indexing below is unambiguous.
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values
    _ = sss.get_n_splits(X, y)
    Reports_Train = []
    Reports_Test = []
    CM_Train = []
    CM_Test = []
    R = None  # keeps the report shape/index/columns of the last split
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        _ = model.fit(X_train,y_train)
        # Train-set report and confusion matrix for this split
        y_pred = model.predict(X_train)
        R = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names=Labels, output_dict=True)).T
        Reports_Train.append(R.values)
        CM_Train.append(metrics.confusion_matrix(y_train, y_pred))
        # Test-set report and confusion matrix for this split
        y_pred = model.predict(X_test)
        R = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=Labels, output_dict=True)).T
        Reports_Test.append(R.values)
        CM_Test.append(metrics.confusion_matrix(y_test, y_pred))

    def _aggregate(reports, cms):
        # Aggregate per-split results into 'mean ± std' strings and a mean CM.
        # Stacking once replaces the former repeated np.vstack, which was
        # O(n^2) and failed for n_splits == 1 (the accumulator stayed 1-D).
        ALL = np.array([r.ravel() for r in reports])
        CM = np.array([c.ravel() for c in cms])
        Mean = pd.DataFrame(ALL.mean(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
        STD = pd.DataFrame(ALL.std(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
        report = Mean.applymap(lambda x: ('%.4f' % x))+ ' ± ' +STD.applymap(lambda x: ('%.4f' % x))
        cm = CM.mean(axis = 0).reshape(cms[0].shape).round(0).astype(int)
        return report, cm

    Reports_Train, CM_Train = _aggregate(Reports_Train, CM_Train)
    Reports_Test, CM_Test = _aggregate(Reports_Test, CM_Test)
    Reports_Train = Reports_Train.reset_index().rename(columns ={'index': 'Train Set (CV = % i)' % n_splits})
    Reports_Test = Reports_Test.reset_index().rename(columns ={'index': 'Test Set (CV = % i)' % n_splits})
    return Reports_Train, Reports_Test, CM_Train, CM_Test
def Confusion_Mat(CM_Train, CM_Test, PD, n_splits = 10):
    """Draw raw and row-normalised confusion-matrix heatmaps for the train and
    test sets, one figure per set. PD supplies figure size, annotation size,
    colorbar shrink factor and tick labels."""
    if n_splits is None:
        Titles = ['Train Set', 'Test Set']
    else:
        Titles = ['Train Set (CV = % i)' % n_splits, 'Test Set (CV = % i)' % n_splits]
    palettes = [('Greens', 'YlGn'), ('Blues', 'PuBu')]
    for title, cm, (raw_cmap, norm_cmap) in zip(Titles, [CM_Train, CM_Test], palettes):
        fig, ax = plt.subplots(1, 2, figsize= PD['FS'])
        fig.suptitle(title, weight = 'bold', fontsize = 16)
        # Left: raw counts.
        _ = sns.heatmap(cm, annot=True, annot_kws={"size": PD['annot_kws']}, cmap=raw_cmap, ax = ax[0],
                        linewidths = 0.2, cbar_kws={"shrink": PD['shrink']})
        _ = ax[0].set_title('Confusion Matrix')
        # Right: counts normalised per true-label row.
        normalised = np.round(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis], 2)
        _ = sns.heatmap(normalised,
                        annot=True, annot_kws={"size": PD['annot_kws']}, cmap=norm_cmap, ax = ax[1],
                        linewidths = 0.4, vmin=0, vmax=1, cbar_kws={"shrink": PD['shrink']})
        _ = ax[1].set_title('Normalized Confusion Matrix')
        for a in ax:
            _ = a.set_xlabel('Predicted labels')
            _ = a.set_ylabel('True labels')
            _ = a.xaxis.set_ticklabels(PD['Labels'])
            _ = a.yaxis.set_ticklabels(PD['Labels'])
            _ = a.set_aspect(1)
def Train_Test_Scores(CM_Train, CM_Test):
    """Print precision, recall, TPR, TNR and balanced accuracy derived from the
    train and test confusion matrices (label 1 = positive class)."""
    for name, cm, color in zip(['Train', 'Test'], [CM_Train, CM_Test], ['Green', 'Blue']):
        Header('%s Set' % name, C = color)
        # sklearn's 2x2 confusion matrix ravels as (tn, fp, fn, tp).
        tn, fp, fn, tp = cm.ravel()
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        tpr = tp / (tp + fn)
        tnr = tn / (tn + fp)
        balanced_acc = (tpr + tnr) / 2
        print('Precision (%s) = %.2f' % (name, precision))
        print('Recall (%s) = %.2f' % (name, recall))
        print('TPR (%s) = %.2f' % (name, tpr))
        print('TNR (%s) = %.2f' % (name, tnr))
        print('Balanced Accuracy (%s) = %.2f' % (name, balanced_acc))
        Line()
Some of the metrics that we use here to measure the accuracy: \begin{align} \text{Confusion Matrix} = \begin{bmatrix}T_n & F_p\\ F_n & T_p\end{bmatrix}. \end{align}
where $T_p$, $T_n$, $F_p$, and $F_n$ represent true positive, true negative, false positive, and false negative, respectively.
\begin{align} \text{Precision} &= \frac{T_{p}}{T_{p} + F_{p}},\\ \text{Recall} &= \frac{T_{p}}{T_{p} + F_{n}},\\ \text{F1} &= \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}\\ \text{Balanced-Accuracy (bACC)} &= \frac{1}{2}\left( \frac{T_{p}}{T_{p} + F_{n}} + \frac{T_{n}}{T_{n} + F_{p}}\right ) \end{align}The accuracy can be a misleading metric for imbalanced data sets. In these cases, a balanced accuracy (bACC) [4] is recommended that normalizes true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two.
# Baseline run: Gaussian Naive-Bayes with its out-of-the-box parameters.
Name = 'Gaussian Naive-Bayes'
n_splits = 20
Header('%s with Default Parameters' % Name)
GNB = GaussianNB()
print('Default Parameters = %s' % GNB.get_params(deep=True))
_ = GNB.fit(X_train, y_train)
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(GNB, X = X, y = y, n_splits = n_splits)
# Styled per-set score tables.
train_col = 'Train Set (CV = % i)' % n_splits
test_col = 'Test Set (CV = % i)' % n_splits
display(Reports_Train.style.hide_index()
        .set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'})
        .set_properties(subset=[train_col], **{'background-color': 'SeaGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index()
        .set_properties(**{'background-color': 'Azure', 'color': 'Black'})
        .set_properties(subset=[test_col], **{'background-color': 'RoyalBlue', 'color': 'White'}))
# Confusion matrices and scalar scores.
PD = dict(FS = (10, 5), annot_kws = 14, shrink = .6, Labels = list(Labels_dict.values()))
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = n_splits)
Train_Test_Scores(CM_Train, CM_Test)
Gaussian Naive-Bayes with Default Parameters ======================================================= Default Parameters = {'priors': None, 'var_smoothing': 1e-09}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.9170 ± 0.0063 | 0.8684 ± 0.0297 | 0.8917 ± 0.0136 | 863.0000 ± 0.0000 |
| Yes | 0.4691 ± 0.0409 | 0.5904 ± 0.0450 | 0.5198 ± 0.0164 | 166.0000 ± 0.0000 |
| accuracy | 0.8236 ± 0.0188 | 0.8236 ± 0.0188 | 0.8236 ± 0.0188 | 0.8236 ± 0.0188 |
| macro avg | 0.6931 ± 0.0187 | 0.7294 ± 0.0116 | 0.7058 ± 0.0137 | 1029.0000 ± 0.0000 |
| weighted avg | 0.8448 ± 0.0054 | 0.8236 ± 0.0188 | 0.8317 ± 0.0133 | 1029.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.9103 ± 0.0083 | 0.8611 ± 0.0350 | 0.8847 ± 0.0194 | 370.0000 ± 0.0000 |
| Yes | 0.4438 ± 0.0642 | 0.5577 ± 0.0462 | 0.4910 ± 0.0420 | 71.0000 ± 0.0000 |
| accuracy | 0.8122 ± 0.0283 | 0.8122 ± 0.0283 | 0.8122 ± 0.0283 | 0.8122 ± 0.0283 |
| macro avg | 0.6771 ± 0.0339 | 0.7094 ± 0.0249 | 0.6878 ± 0.0297 | 441.0000 ± 0.0000 |
| weighted avg | 0.8352 ± 0.0145 | 0.8122 ± 0.0283 | 0.8213 ± 0.0223 | 441.0000 ± 0.0000 |
Train Set ========================================================================================== Precision (Train) = 0.46 Recall (Train) = 0.59 TPR (Train) = 0.59 TNR (Train) = 0.87 Balanced Accuracy (Train) = 0.73 Test Set =========================================================================================== Precision (Test) = 0.44 Recall (Test) = 0.56 TPR (Test) = 0.56 TNR (Test) = 0.86 Balanced Accuracy (Test) = 0.71 ====================================================================================================
In order to find the best parameters for our model, we can use RandomizedSearchCV. Here, we have defined a function Best_Parm to find the best parameters.
# Hyper-parameter search space: a grid of class priors (including the
# empirical class frequencies) crossed with several smoothing strengths.
GNB = GaussianNB()
Priors = [np.array([p, 1 - p]) for p in np.arange(.25, 1, .25)]
Priors.append((y.value_counts().values/y.count()).round(2))
param_dist = {'priors': Priors, 'var_smoothing': [10**(-e) for e in range(1,11,3)]}
del Priors
Header('%s with the Best Parameters' % Name)
grid = Best_Parm(model = GNB, param_dist = param_dist, Top = 20, H = 650, titleY =.96)
Gaussian Naive-Bayes with the Best Parameters ======================================================
| Rank Test Score | Params | Train Score | Test Score | Fit Time |
|---|---|---|---|---|
| 1 | var_smoothing: 0.1, priors: array([0.84, 0.16]) | 5.15e-01 ± 5.17e-02 | 4.82e-01 ± 7.46e-02 | 2.25e-03 ± 4.33e-04 |
| 2 | var_smoothing: 0.0001, priors: array([0.84, 0.16]) | 4.71e-01 ± 4.07e-02 | 4.45e-01 ± 6.36e-02 | 2.05e-03 ± 4.98e-04 |
| 3 | var_smoothing: 1e-07, priors: array([0.84, 0.16]) | 4.71e-01 ± 4.07e-02 | 4.45e-01 ± 6.38e-02 | 2.10e-03 ± 5.39e-04 |
| 3 | var_smoothing: 1e-10, priors: array([0.84, 0.16]) | 4.71e-01 ± 4.07e-02 | 4.45e-01 ± 6.38e-02 | 2.10e-03 ± 3.00e-04 |
| 5 | var_smoothing: 0.1, priors: array([0.75, 0.25]) | 4.19e-01 ± 3.44e-02 | 3.94e-01 ± 5.04e-02 | 2.10e-03 ± 5.39e-04 |
| 6 | var_smoothing: 0.0001, priors: array([0.75, 0.25]) | 3.96e-01 ± 3.27e-02 | 3.72e-01 ± 4.26e-02 | 2.15e-03 ± 5.72e-04 |
| 6 | var_smoothing: 1e-07, priors: array([0.75, 0.25]) | 3.96e-01 ± 3.27e-02 | 3.72e-01 ± 4.26e-02 | 1.87e-03 ± 3.72e-04 |
| 6 | var_smoothing: 1e-10, priors: array([0.75, 0.25]) | 3.96e-01 ± 3.27e-02 | 3.72e-01 ± 4.26e-02 | 2.20e-03 ± 6.78e-04 |
| 9 | var_smoothing: 0.1, priors: array([0.5, 0.5]) | 2.99e-01 ± 1.62e-02 | 2.84e-01 ± 2.10e-02 | 1.85e-03 ± 3.57e-04 |
| 10 | var_smoothing: 1e-07, priors: array([0.5, 0.5]) | 2.97e-01 ± 1.53e-02 | 2.83e-01 ± 2.10e-02 | 2.15e-03 ± 3.57e-04 |
| 10 | var_smoothing: 1e-10, priors: array([0.5, 0.5]) | 2.97e-01 ± 1.53e-02 | 2.83e-01 ± 2.10e-02 | 2.00e-03 ± 3.16e-04 |
| 12 | var_smoothing: 0.0001, priors: array([0.5, 0.5]) | 2.97e-01 ± 1.53e-02 | 2.83e-01 ± 2.08e-02 | 1.95e-03 ± 4.98e-04 |
| 13 | var_smoothing: 0.0001, priors: array([0.25, 0.75]) | 2.40e-01 ± 6.12e-03 | 2.33e-01 ± 1.63e-02 | 2.25e-03 ± 4.33e-04 |
| 13 | var_smoothing: 1e-07, priors: array([0.25, 0.75]) | 2.40e-01 ± 6.12e-03 | 2.33e-01 ± 1.63e-02 | 2.10e-03 ± 5.39e-04 |
| 13 | var_smoothing: 1e-10, priors: array([0.25, 0.75]) | 2.40e-01 ± 6.12e-03 | 2.33e-01 ± 1.63e-02 | 2.10e-03 ± 3.00e-04 |
| 16 | var_smoothing: 0.1, priors: array([0.25, 0.75]) | 2.36e-01 ± 5.00e-03 | 2.29e-01 ± 1.42e-02 | 2.60e-03 ± 4.90e-04 |
Since we have identified the best parameters for our modeling, we train another model using these parameters.
# Final run: refit Gaussian Naive-Bayes with the parameters found by the search.
Header('%s with the Best Parameters' % Name)
GNB = GaussianNB(**grid.best_params_)
# Label fixed: these are the tuned parameters, not the defaults.
print('Best Parameters = %s' % GNB.get_params(deep=True))
_ = GNB.fit(X_train, y_train)
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(GNB, X = X, y = y, n_splits = n_splits)
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'DarkGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'MediumBlue', 'color': 'White'}))
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = n_splits)
Train_Test_Scores(CM_Train, CM_Test)
Gaussian Naive-Bayes with the Best Parameters ====================================================== Default Parameters = {'priors': array([0.84, 0.16]), 'var_smoothing': 0.1}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.9111 ± 0.0072 | 0.8983 ± 0.0285 | 0.9043 ± 0.0121 | 863.0000 ± 0.0000 |
| Yes | 0.5153 ± 0.0517 | 0.5431 ± 0.0512 | 0.5245 ± 0.0185 | 166.0000 ± 0.0000 |
| accuracy | 0.8410 ± 0.0172 | 0.8410 ± 0.0172 | 0.8410 ± 0.0172 | 0.8410 ± 0.0172 |
| macro avg | 0.7132 ± 0.0239 | 0.7207 ± 0.0150 | 0.7144 ± 0.0133 | 1029.0000 ± 0.0000 |
| weighted avg | 0.8472 ± 0.0068 | 0.8410 ± 0.0172 | 0.8430 ± 0.0119 | 1029.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| No | 0.9050 ± 0.0076 | 0.8897 ± 0.0309 | 0.8971 ± 0.0166 | 370.0000 ± 0.0000 |
| Yes | 0.4822 ± 0.0746 | 0.5134 ± 0.0442 | 0.4936 ± 0.0434 | 71.0000 ± 0.0000 |
| accuracy | 0.8291 ± 0.0250 | 0.8291 ± 0.0250 | 0.8291 ± 0.0250 | 0.8291 ± 0.0250 |
| macro avg | 0.6936 ± 0.0389 | 0.7016 ± 0.0234 | 0.6953 ± 0.0290 | 441.0000 ± 0.0000 |
| weighted avg | 0.8370 ± 0.0156 | 0.8291 ± 0.0250 | 0.8321 ± 0.0201 | 441.0000 ± 0.0000 |
Train Set ========================================================================================== Precision (Train) = 0.51 Recall (Train) = 0.54 TPR (Train) = 0.54 TNR (Train) = 0.90 Balanced Accuracy (Train) = 0.72 Test Set =========================================================================================== Precision (Test) = 0.47 Recall (Test) = 0.51 TPR (Test) = 0.51 TNR (Test) = 0.89 Balanced Accuracy (Test) = 0.70 ====================================================================================================